/* 
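    Purpose: Using the 1% 1910-1990 Censuses,
             this file keeps white and black fathers 
             aged 30-50 and their family members. 
             The median # of children (under 18) and 
             the median # of members in the family unit,
             plus the medians of the IPUMS own-children 
             (nchild) and own-family-size (famsize) 
             variables, are then calculated for each
             occupation x race x south cell, 
             separately by Census year. 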

    Creates: NumberChildren_byCensus.dta
*/
* Session setup: start from an empty dataset, disable -more- pauses so the
* script runs unattended, and make all relative paths below (./input, ./output,
* ../Crosswalks) resolve against the Census data folder.
* The global Mydirectory1 must be defined by the caller before this file runs.
clear
set more off
cd "$Mydirectory1/1_DataSources/CensusData/"

* Main loop: for each Census year 1910-1990, build the father sample, compute
* weighted median family-size measures by occ x race x south cell, impute empty
* cells with the race x south median, and store the result in a tempfile whose
* name is held in the local macro stats followed by the year.
forval i=1910(10)1990 {

	* Import raw data for the relevant year only.
	* Filtering inside -use- avoids loading every Census year into memory on each pass.
	use if year==`i' using ./input/Census_1910to2010_1pct_raw.dta, clear //download from IPUMS USA
		tab year
		
		tempfile fulldata
		save `fulldata'
		
	* Build the list of fathers: one row per (serial, poploc) pointed to by a child under 18
		keep if age<18 //Restrict to children younger than 18
		keep serial poploc
		
		drop if poploc==0 | poploc==. //Exclude children without a father in the house
		
		bysort serial poploc: keep if _n==1
		rename poploc pernum //poploc is the father's person number within the household

		tempfile children 
		save `children'
		
	* Bring in fathers and match them to their children
		use `fulldata', clear
		merge 1:1 serial pernum using `children'
		assert _merge!=2 //every father pointer must resolve to a person record
		
		gen father = _merge==3 
		drop _merge
		
		tab nchild if father==1
		
	* Keep households with white or black fathers aged 30-50 
		gen father_sample = father==1 & inrange(age,30,50) & inlist(race,1,2)
		bysort serial famunit: egen family = max(father_sample)
		keep if family==1
			
	* Count # of children in household and family size
	  /*Note: "# of children in household" 
	           is a broader measure than an 
	           "own children" measure. */
		gen child = age<18
		gen member=1
		
		* total() is the current name of the deprecated egen sum() function
		bysort serial famunit: egen number_children_family = total(child)
		bysort serial famunit: egen number_member_family = total(member)
		
		rename nchild number_own_children
		rename famsize number_own_family
		
		drop child member
		
	* Now only keep fathers
		keep if father_sample==1
		assert race==1 | race==2
		assert age>=30 & age<=50
		
	**----------------------------------**

	*****************
	*** REGION VARIABLE
	*****************
		tab statefip, m
		rename statefip fips
		
	* Region of current residence (rebuilt from state FIPS codes; the raw region variable is discarded)
		drop region

	* Northeast: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont, New Jersey, New York, Pennsylvania
    * Midwest: Illinois, Indiana, Michigan, Ohio, Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, South Dakota
    /* South: Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia, West Virginia, Alabama,
          Kentucky, Mississippi, Tennessee, Arkansas, Louisiana, Oklahoma, Texas */
    * West: Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, Wyoming, California, Oregon, Washington --note: Census puts AK and HI in with Pacific division
		gen region=.
		replace region=1 if inlist(fips, 9, 23, 25, 33, 44, 50, 34, 36, 42)
		replace region=2 if inlist(fips, 17, 18, 26, 39, 55, 19, 20, 27, 29, 31, 38, 46)
		replace region=3 if inlist(fips, 10, 11, 12, 13, 24, 37, 45, 51, 54, 1, 21, 28, 47, 5, 22, 40, 48)
		replace region=4 if inlist(fips, 4, 8, 16, 30, 32, 35, 49, 56, 6, 41, 53, 2, 15)
		tab region, m 

		* South indicator; stays missing when the state code maps to no region
		gen south_merge = region==3 if region<.
		tab south_merge, m

	**----------------------------------**

	****************
	**** ASSIGN COARSENED OCCS
	****************

	* Set up variables
		sort occ1950
		replace occ1950=. if occ1950>=980

	* Separate people with occupations in 200's based on self-employment
		replace occ1950=occ1950+1000 if (occ1950>=200 & occ1950<=290) & classwkr==1
		
	* Crosswalk Census occupations to coarsened ANES occupations
		merge m:1 occ1950 using ../Crosswalks/Crosswalk_1950Census_toANES.dta
		assert occ1950==. if _merge==1 //every non-missing occupation must be in the crosswalk
		drop if _merge==2
		drop _merge
		
	**----------------------------------**
	**----------------------------------**

	*******************************************************
	*** PRELIMINARY STEP: COLLAPSE AT RACE X SOUTH LEVEL
	*******************************************************
		global vars_list "number_own_family number_own_children number_children_family number_member_family"
		
	/* Drop people with missing occupation, south, 
	   or race not black or white */
		drop if occ1950ej==. | south_merge==. | race>2
		
	/* Accountants are missing from 
	   1940 microdata. Add them as a single
	   placeholder row so that the occupation
	   enters the template and is imputed below.
	   Only the new (last) row is edited, so real
	   observations are never recoded. */
		if "`i'"=="1940" {
			set obs `=_N+1'
			replace occ1950ej=1 in `=_N'
			replace race=1 in `=_N'
		}
			
	/* Obtain median family size measures 
	   (at race x south level) */
	preserve

		* Placeholder rows have no region or weight; exclude them explicitly
		drop if missing(south_merge)
		collapse (p50) $vars_list [aw=perwt], by(race south_merge) 
		
		foreach c in $vars_list {
			rename `c' `c'_racesouth
		}
		
		tempfile racesouth
		save `racesouth'

	restore

	****************************************************
	*** COLLAPSE (OCC X RACE x SOUTH)
	****************************************************
	
	*************************
	*** TEMPLATES 
	*************************

	* Template 1: occ x race
	* One row per occupation, duplicated and numbered so that both race codes
	* (1 and 2) always appear, even when an occupation has no white fathers.
	preserve

		collapse (min) race, by(occ1950ej)
			
		expand 2
		bysort occ1950ej: replace race=_n

		tempfile occbyrace
		save `occbyrace'

	restore 
		
	* Template 2: race x south
	preserve 
		
		drop if south_merge==. //will flag (later) which occ x race x south-level observations must be imputed 
		collapse (min) race, by(south_merge)
		expand 2
		bysort south_merge: replace race=_n
		
		tempfile south
		save `south'
		
	restore
		
	* Combine templates 1 & 2 into the full occ x race x south grid
	preserve

		use `occbyrace', clear
		joinby race using `south'
		
		tempfile template
		save `template'
		
	restore
	
	* Obtain median family size measures (at occ x race x south level)	
		* Placeholder rows carry no data; drop them rather than rely on the missing weight
		drop if missing(south_merge)
		gen number=1
		collapse (rawsum) number (p50) $vars_list [aw=perwt], by(occ1950ej race south_merge) 
		
		tempfile number
		save `number'

		use `template', clear
		merge 1:1 occ1950ej race south_merge using `number'
		assert _merge!=2 //the template must cover every observed cell

	**************************	
	* IMPUTATIONS
	**************************		
	* Dummy: observations that must be imputed 
		gen imputed = _merge==1
		drop _merge
		
	* Assign race x south value for missing occ x race x south cells
		merge m:1 race south_merge using `racesouth', keep(master match) nogenerate
		
		foreach c in $vars_list {
			replace `c'=`c'_racesouth if `c'==.
		}
		drop *_racesouth
		
	* Label variables	
		label var number_children_family "Median number of children under 18 in family unit"
		label var number_member_family "Median number of members in family unit"
		label var number_own_children "Median number of own children in household (using nchild)"
		label var number_own_family "Median number of own family members in household (using famsize)"
		label var imputed "Observation imputed using race x south numbers"
		
		replace number=0 if number==.
		label var number "Number of obs in occ x race x south cell"
		
		gen year_census=`i'
		label var year_census "Year of Census"
		
		* Tempfile names survive the loop, so the append step can reference them by year
		tempfile stats`i'
		save `stats`i''
	
}

* Append files by Census decade
* Stack the per-year tempfiles created in the loop above (1910 first, then
* 1920-1990) into one dataset, then sort and save the final output file.
* -clear- makes the load independent of whatever is left in memory.
	use `stats1910', clear
	forval i=1920(10)1990 {
		append using `stats`i''
	}
	
	compress
	sort year_census occ1950ej race south_merge
	save ./output/NumberChildren_byCensus.dta, replace
